Background

This file is designed to analyze coronavirus data from a single state using three data sources:

Data from each of these sources has been processed using code available in:

The goal of this file is to explore the measures for a single state. Data are generally available through late October (cases, deaths, tests, hospitalizations) and late August (all-cause deaths).

Running Analyses

The process uses tidyverse throughout, and otherwise calls functions using library::function(). Further, a variable mapping file is used for readable plot axis labels, and shared functions are made available:

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
source("./Coronavirus_Statistics_Functions_Shared_v003.R")

# Create a variable mapping file
varMapper <- c("cases"="Cases", 
               "newCases"="Increase in cases, most recent 30 days",
               "casesroll7"="Rolling 7-day mean cases", 
               "deaths"="Deaths", 
               "newDeaths"="Increase in deaths, most recent 30 days",
               "deathsroll7"="Rolling 7-day mean deaths", 
               "cpm"="Cases per million",
               "cpm7"="Cases per day (7-day rolling mean) per million", 
               "newcpm"="Increase in cases, most recent 30 days, per million",
               "dpm"="Deaths per million", 
               "dpm7"="Deaths per day (7-day rolling mean) per million", 
               "newdpm"="Increase in deaths, most recent 30 days, per million", 
               "hpm7"="Currently Hospitalized per million (7-day rolling mean)", 
               "tpm"="Tests per million", 
               "tpm7"="Tests per million per day (7-day rolling mean)", 
               "cdcExcess"="Excess all-cause (CDC)", 
               "ctp_death7"="COVID Tracking Project", 
               "usaf_death7"="USA Facts",
               "CDC_deaths"="CDC total deaths",
               "CDC_excess"="CDC excess deaths",
               "CTP_cases"="COVID Tracking Project cases",
               "CTP_deaths"="COVID Tracking Project deaths",
               "CTP_hosp"="COVID Tracking Project hospitalized",
               "CTP_tests"="COVID Tracking Project tests",
               "USAF_cases"="USA Facts cases", 
               "USAF_deaths"="USA Facts deaths",
               "vpm7"="Per million people (7-day rolling daily average)",
               "vpm"="Per million people"
               )

Data are loaded from files previously processed:

ctpList <- readFromRDS("test_hier5_201025")
usafData <- readFromRDS("cty_20201026")$clusterStateData
cdcList <- readFromRDS("cdcList_20201027")

A function is written to examine the alignment of death data across the data sources for a single state:

combineDeathData <- function(ctp, 
                             usaf, 
                             cdc, 
                             keyState, 
                             curYear=2020,
                             minDate=as.Date(paste0(curYear, "-01-01")), 
                             perMillion=FALSE,
                             glimpseIntermediate=FALSE, 
                             facetFreeY=!perMillion, 
                             returnData=TRUE
                             ) {
    
    # FUNCTION ARGUMENTS:
    # ctp: the list with COVID Tracking Project data
    # usaf: the data frame with USA Facts data
    # cdc: the list with CDC data
    # keyState: the state(s) to be explored
    # curYear: current year
    # minDate: the minimum date to use in the CDC data
    # perMillion: boolean, should data be show on a per-million-people basis?
    # glimpseIntermediate: boolean, should glimpses of frames be provided as they are built?
    # facetFreeY: boolean, should facets be created with free_y scales (only relevant if 2+ keyStates)
    # returnData: boolean, should the data frame be returned?
    
    # STEP 0a: Extract relevant elements from lists (use frame as-is if directly passed)
    if ("list" %in% class(ctp)) ctp <- ctp[["consolidatedPlotData"]]
    if ("list" %in% class(usaf)) usaf <- usaf[["clusterStateData"]]
    if ("list" %in% class(cdc)) cdc <- cdc[["stateAgg"]]
    
    # STEP 0b: Create a mapping file of date to epiWeek
    epiMap <- tibble::tibble(date=seq.Date(from=minDate, to=as.Date(paste0(curYear, "-12-31")), by="1 day"), 
                             week=lubridate::epiweek(date)
                             )
    
    # STEP 1: Filter to only relevant data
    # STEP 1a: COVID Tracking Project
    ctp <- ctp %>%
        ungroup() %>%
        filter(name=="deaths", state %in% keyState)
    if(glimpseIntermediate) glimpse(ctp)
    # STEP 1b: USA Facts
    usaf <- usaf %>%
        ungroup() %>%
        filter(state %in% keyState)
    if(glimpseIntermediate) glimpse(usaf)
    # STEP 1c: CDC
    cdc <- cdc %>%
        ungroup() %>%
        filter(year==curYear, state %in% keyState)
    if(glimpseIntermediate) glimpse(cdc)
    
    # STEP 2a: Sum the county-level data so that it is state-level data
    usafState <- usaf %>%
        group_by(state, date) %>%
        summarize(deaths=sum(deaths), dpm7=sum(dpm7*pop)/sum(pop), pop=sum(pop)) %>%
        ungroup()
    # STEP 2b: Convert the CDC data to an estimated daily total (split the weekly total evenly)
    cdcDaily <- cdc %>%
        left_join(epiMap, by=c("week")) %>%
        select(state, week, date, cdcDeaths=deaths, cdcExcess=delta) %>%
        mutate(cdcDeaths=cdcDeaths/7, cdcExcess=cdcExcess/7)
    
    # STEP 3: Create a state death-level database by date
    dailyDeath <- select(ctp, state, date, ctpDeaths=value, ctp_dpm7=vpm7, ctp_pop=pop) %>%
        full_join(select(usafState, state, date, usafDeaths=deaths, usaf_dpm7=dpm7, usaf_pop=pop), 
                  by=c("state", "date")
                  ) %>%
        full_join(cdcDaily, by=c("state", "date")) %>%
        arrange(state, date) %>%
        mutate(ctp_death7=ctp_dpm7*ctp_pop/1000000, usaf_death7=usaf_dpm7*usaf_pop/1000000)
    if(glimpseIntermediate) glimpse(dailyDeath)

    # STEP 4a: Assign a population by state
    statePop <- dailyDeath %>%
        group_by(state) %>%
        summarize(pop=max(usaf_pop, ctp_pop, na.rm=TRUE))
    
    # STEP 4b: Plot the deaths data
    p1 <- dailyDeath %>%
        select(state, date, ctp_death7, usaf_death7, cdcExcess) %>%
        pivot_longer(-c(state, date), names_to="source", values_to="deaths") %>%
        filter(!is.na(deaths)) %>%
        left_join(statePop, by="state") %>%
        ggplot(aes(x=date, y=deaths*if(perMillion) (1000000/pop) else 1)) + 
        geom_line(aes(group=source, color=varMapper[source])) + 
        labs(x="", 
             y=paste0("Deaths", if(perMillion) " per million" else ""), 
             title=paste0(curYear, " deaths per day in ", paste0(keyState, collapse=", ")),
             subtitle=paste0("Rolling 7-day average", if(perMillion) " per million people" else ""),
             caption="CDC estimated excess all-cause deaths, weekly total divided by 7 to estimate daily total"
             ) + 
        scale_x_date(date_breaks="1 month", date_labels="%b") + 
        scale_color_discrete("Data source") + 
        theme(legend.position="bottom") + 
        geom_hline(yintercept=0, lty=2)
    if (length(keyState) > 1) p1 <- p1 + facet_wrap(~state, scales=if(facetFreeY) "free_y" else "fixed")
    print(p1)
    
    # STEP 5: Return the daily death file
    if(returnData) dailyDeath
    
}

# Example function
combineDeathData(ctp=ctpList, 
                 usaf=usafData, 
                 cdc=cdcList, 
                 keyState=c("NY", "FL", "MI", "WI"), 
                 perMillion=FALSE, 
                 returnData=FALSE
                 )
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

combineDeathData(ctp=ctpList, 
                 usaf=usafData, 
                 cdc=cdcList, 
                 keyState=c("NY", "FL", "MI", "WI"), 
                 perMillion=TRUE, 
                 returnData=FALSE
                 )
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)

A different approach would be to combine all of the data, then to subset as needed for producing a given plot. Broadly, the data elements include:

Functions are written to create a main state-level database in the following format:

Functions and example code include:

# Function to convert a COVID Tracking Project file for further processing
prepCTPData <- function(ctp) {
    
    # FUNCTION AGRUMENTS:
    # ctp: a properly formatted list or data frame containing processed COVID Tracking Project data

    # Pull the relevant data frame if a list has been passed    
    if ("list" %in% class(ctp)) ctp <- ctp[["consolidatedPlotData"]]

    # Ungroup the data, delete the state named 'cluster', and Create a value7 metric
    ctp <- ctp %>%
        ungroup() %>%
        filter(state != "cluster") %>%
        mutate(value7=ifelse(is.na(vpm7), NA, vpm7*pop/1000000))
    
    # Split state-cluster-population as a separate file unique by state
    ctpDemo <- ctp %>%
        group_by(state, cluster) %>%
        summarize(pop=max(pop, na.rm=TRUE)) %>%
        ungroup()
    
    # Create a final data file with the key elements
    ctpData <- ctp %>%
        rename(metric=name) %>%
        mutate(source="CTP", name=paste0(source, "_", metric)) %>%
        select(state, date, metric, source, name, value, value7, vpm, vpm7)
    
    # Return the key data frames
    list(ctpDemo=ctpDemo, ctpData=ctpData)
    
}

ctpPrepped <- prepCTPData(ctpList)
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
# Function to convert a USA Facts file for further processing
prepUSAFData <- function(usaf) {
    
    # FUNCTION AGRUMENTS:
    # usaf: a properly formatted list or data frame containing processed USA Facts data

    # Pull the relevant data frame if a list has been passed    
    if ("list" %in% class(usaf)) usaf <- usaf[["clusterStateData"]]

    # Sum the data to state, keeping only state-date-pop-cases-deaths, then pivot longer
    usaf <- usaf %>%
        group_by(state, date) %>%
        summarize(cases=sum(cases), deaths=sum(deaths), pop=sum(pop)) %>%
        ungroup() %>%
        pivot_longer(-c(state, date, pop), names_to="metric", values_to="value")
    
    # Create the rolling-7 for value, having grouped by state-pop-metric and sorted by date
    # Add the per million component
    usaf <- usaf %>%
        group_by(state, pop, metric) %>%
        arrange(date) %>%
        helperRollingAgg(origVar="value", newName="value7") %>%
        ungroup() %>%
        mutate(vpm=value*1000000/pop, vpm7=value7*1000000/pop)
    
    # Split state-pop as a separate file unique by state
    usafDemo <- usaf %>%
        group_by(state) %>%
        summarize(pop=max(pop, na.rm=TRUE)) %>%
        ungroup()
    
    # Create a final data file with the key elements
    usafData <- usaf %>%
        mutate(source="USAF", name=paste0(source, "_", metric)) %>%
        select(state, date, metric, source, name, value, value7, vpm, vpm7)
    
    # Return the key data frames
    list(usafDemo=usafDemo, usafData=usafData)
    
}

usafPrepped <- prepUSAFData(usafData)
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Function to convert a CDC file for further processing
prepCDCData <- function(cdc, 
                        popData,
                        startYear=2020, 
                        startDate=as.Date(paste0(startYear, "-01-01")), 
                        endDate=as.Date(paste0(startYear, "-12-31"))
                        ) {
    
    # FUNCTION AGRUMENTS:
    # cdc: a properly formatted list or data frame containing processed CDC data
    # popData: a file containing fields state-pop
    # startYear: starting year (CDC data will be filtered for this year and later)
    # startDate: the starting date for use in the mapping file to create daily estimates
    # endDate: the ending date for use in the mapping file to create daily estimates

    # Pull the relevant data frame if a list has been passed    
    if ("list" %in% class(cdc)) cdc <- cdc[["stateAgg"]]

    # Create a mapping file of dates to epiweek-epiyear
    epiMap <- tibble::tibble(date=seq.Date(from=startDate, to=endDate, by="1 day"), 
                             year=lubridate::epiyear(date),
                             week=lubridate::epiweek(date)
                             )
    
    # Filter the data to the relevant year and keep state-year-week-deaths-excess
    cdc <- cdc %>%
        filter(yearint >= startYear) %>%
        select(state, yearint, week, deaths, excess=delta)

    # Merge in the daily mapping file, divide all totals by 7 to reflect weekly to daily, and pivot longer
    cdc <- cdc %>%
        left_join(epiMap, by=c("yearint"="year", "week"="week")) %>%
        mutate(deaths=deaths/7, excess=excess/7) %>%
        select(state, date, deaths, excess) %>%
        pivot_longer(-c(state, date), names_to="metric", values_to="value")
    
    # Create the rolling-7 for value, having grouped by state-metric and sorted by date
    # Add the per million component
    cdc <- cdc %>%
        group_by(state, metric) %>%
        arrange(date) %>%
        helperRollingAgg(origVar="value", newName="value7") %>%
        ungroup() %>%
        left_join(select(popData, state, pop), by="state") %>%
        mutate(vpm=value*1000000/pop, 
               vpm7=value7*1000000/pop, 
               source="CDC", 
               name=paste0(source, "_", metric)
               ) %>%
        select(state, date, pop, metric, source, name, value, value7, vpm, vpm7)
    
    # Return the key data frame as a list
    list(cdcDemo=select(cdc, state, pop), cdcData=select(cdc, -pop))
    
}

# Create an integrated state demographics file
demoData <- ctpPrepped$ctpDemo %>%
    rename(popCTP=pop) %>%
    full_join(rename(usafPrepped$usafDemo, popUSAF=pop), by="state") %>%
    mutate(pop=pmax(popCTP, popUSAF))

cdcPrepped <- prepCDCData(cdcList, popData=demoData)



# Integrated state data
stateData <- ctpPrepped$ctpData %>%
    bind_rows(usafPrepped$usafData) %>%
    bind_rows(cdcPrepped$cdcData)
glimpse(stateData)
## Rows: 98,931
## Columns: 9
## $ state  <chr> "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", ...
## $ date   <date> 2020-03-06, 2020-03-07, 2020-03-08, 2020-03-09, 2020-03-10,...
## $ metric <chr> "cases", "cases", "cases", "cases", "cases", "cases", "cases...
## $ source <chr> "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP...
## $ name   <chr> "CTP_cases", "CTP_cases", "CTP_cases", "CTP_cases", "CTP_cas...
## $ value  <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 3, 0, 6, 2, 8, 0, 14, 6,...
## $ value7 <dbl> NA, NA, NA, 0.0000000, 0.1428571, 0.1428571, 0.1428571, 0.14...
## $ vpm    <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, ...
## $ vpm7   <dbl> NA, NA, NA, 0.0000000, 0.1934601, 0.1934601, 0.1934601, 0.19...
# Control totals
stateData %>%
    group_by(name) %>%
    summarize(value=sum(value, na.rm=TRUE), value7=sum(value7, na.rm=TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 8 x 3
##   name             value     value7
##   <chr>            <dbl>      <dbl>
## 1 CDC_deaths    2166309.   2114652.
## 2 CDC_excess     236718.    234468.
## 3 CTP_cases     8464908    8243331.
## 4 CTP_deaths     215758     212969.
## 5 CTP_hosp      8632281    8405618.
## 6 CTP_tests   130976369  127573841.
## 7 USAF_cases    8395819    8195223.
## 8 USAF_deaths    221247     218751.

Most of the metrics included are point estimates rather than cumulative. The notable exception is CTP_hosp which is the total number of people in the census states (not all report) that are currently hospitalized with coronavirus. This is a flow metric that is influenced by new admissions and declines driven by various factors such as death, recovery, discharge to a care facility, etc.

A function is written to allow for plotting of data, optionally facetted by state:

plotStateMetric <- function(df, 
                            yVal, 
                            namesPlot, 
                            keyStates, 
                            plotTitle=NULL,
                            plotSub=NULL,
                            plotCaption=NULL,
                            facetFixed=TRUE,
                            mapper=varMapper
                            ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame with integrated state data
    # yVal: column to use for the yValues
    # namesPlot: the values of column 'name' to be kept and plotted
    # keyStates: states to be included
    #            if more than one state is passed, facets will be created
    # plotTitle: plot title to be used (NULL means none)
    # plotSub: plot subtitle to be used (NULL means none)
    # plotCaption: plot caption to be used (NULL means none)
    # facetFixed: boolean, if TRUE scales="fixed", if FALSE scales="free_y"
    #             only relevant if length(keyStates) > 1
    
    # Filter the data for only the key elements
    df <- df %>%
        select_at(vars(all_of(c("state", "date", "name", yVal)))) %>%
        filter(state %in% keyStates, name %in% namesPlot)
    
    # Create the relevant line plot
    p1 <- df %>%
        filter(!is.na(get(yVal))) %>%
        ggplot(aes_string(x="date", y=yVal)) + 
        geom_line(aes(group=name, color=mapper[name])) + 
        labs(x="", 
             y=mapper[yVal]
             ) + 
        scale_x_date(date_breaks="1 month", date_labels="%b") + 
        scale_color_discrete("Source and metric") + 
        geom_hline(aes(yintercept=0), lty=2)
    if (length(keyStates) > 1) p1 <- p1 + facet_wrap(~state, scales=if(facetFixed) "fixed" else "free_y")
    if (!is.null(plotTitle)) p1 <- p1 + labs(title=plotTitle)
    if (!is.null(plotSub)) p1 <- p1 + labs(subtitle=plotSub)
    if (!is.null(plotCaption)) p1 <- p1 + labs(caption=plotCaption)
    print(p1)
    
}

plotStateMetric(stateData, 
                yVal="vpm7", 
                namesPlot=c("CDC_excess", "CTP_deaths", "USAF_deaths"), 
                keyStates=c("NY", "FL", "MI", "WI"), 
                plotTitle="2020 coronavirus deaths per million per day (select states)", 
                plotSub="Rolling 7-day average per million people", 
                plotCaption=paste0("Linear model used to estimate excess CDC all-cause deaths by week\n", 
                                   "CDC weekly converted to daily by assigning 1/7 to each day of the week\n", 
                                   "Converted to rolling 7 mean for smoothing"
                                   ), 
                facetFixed=TRUE
                )

Cases per million can be assessed using the same function:

plotStateMetric(stateData, 
                yVal="vpm7", 
                namesPlot=c("CTP_cases", "USAF_cases"), 
                keyStates=c("NY", "FL", "MI", "WI"), 
                plotTitle="2020 coronavirus cases per million per day (select states)", 
                plotSub="Rolling 7-day average per million people", 
                facetFixed=TRUE
                )

Tests per million can be assessed using the same function:

plotStateMetric(stateData, 
                yVal="vpm7", 
                namesPlot=c("CTP_tests"), 
                keyStates=c("NY", "FL", "MI", "WI"), 
                plotTitle="2020 coronavirus tests per million per day (select states)", 
                plotSub="Rolling 7-day average per million people", 
                facetFixed=TRUE
                )

Hospitalized per million can be assessed using the same function:

plotStateMetric(stateData, 
                yVal="vpm", 
                namesPlot=c("CTP_hosp"), 
                keyStates=c("NY", "FL", "MI", "WI"), 
                plotTitle="2020 total hospitalized per million per day (select states)", 
                facetFixed=TRUE
                )

Suppose that the goal is to use two axes to plot metrics with fundamentally differing scales (e.g., cases and deaths). The function plotStateMetric() is updated to allow for a secondary axis:

plotStateMetric <- function(df, 
                            yVal, 
                            namesPlot, 
                            keyStates, 
                            namesSec=NULL,
                            scaleSec=NULL,
                            plotTitle=NULL,
                            plotSub=NULL,
                            plotCaption=NULL,
                            primYLab=NULL,
                            secYLab="Caution, different metric and scale",
                            facetFixed=TRUE,
                            mapper=varMapper
                            ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame with integrated state data
    # yVal: column to use for the yValues
    # namesPlot: the values of column 'name' to be kept and plotted
    # keyStates: states to be included
    #            if more than one state is passed, facets will be created
    # namesSec: names to be plotted on a secondary y-axes
    # scaleSec: scale to be used for the secondary axis 
    #           namesSec/scaleSec should be similar in magnitude to namesPlot
    # plotTitle: plot title to be used (NULL means none)
    # plotSub: plot subtitle to be used (NULL means none)
    # plotCaption: plot caption to be used (NULL means none)
    # primYLab: primary y label (NULL means use mapper)
    # secYLab: secondary y label (default is "Caution, different metric and scale")
    # facetFixed: boolean, if TRUE scales="fixed", if FALSE scales="free_y"
    #             only relevant if length(keyStates) > 1
    
    # Routine is only set up for a secondary axis with facetFixed=TRUE
    if (!is.null(namesSec) & !facetFixed) stop("\nSecondary axis only programmed for scales='fixed'\n")
    
    # Include variables in namesSec as part of namesPlot so they are kept by filter
    if (!is.null(namesSec)) namesPlot <- unique(c(namesPlot, namesSec))
    
    # Filter the data for only the key elements
    df <- df %>%
        select_at(vars(all_of(c("state", "date", "name", yVal)))) %>%
        filter(state %in% keyStates, name %in% namesPlot)
    
    # If there is a secondary scale but no scaleSec has been passed, create one
    if (!is.null(namesSec) & is.null(scaleSec)) {
        maxPrimary <- df %>%
            filter(name %in% setdiff(namesPlot, namesSec)) %>%
            summarize(max(get(yVal), na.rm=TRUE)) %>%
            max()
        maxSecondary <- df %>%
            filter(name %in% namesSec) %>%
            summarize(max(get(yVal), na.rm=TRUE)) %>%
            max()
        scaleSec <- maxSecondary/maxPrimary
        cat("\nWill scale by:", scaleSec, "\n")
    }
    
    # Create the primary y-axis label from mapper if it has not been passed
    if (is.null(primYLab)) primYLab <- mapper[yVal]
    
    # Create the relevant line plot
    p1 <- df %>%
        filter(!is.na(get(yVal))) %>%
        ggplot(aes_string(x="date")) + 
        geom_line(data=~filter(., name %in% setdiff(namesPlot, namesSec)), 
                  aes(y=get(yVal), group=name, color=mapper[name])
                  ) + 
        scale_x_date(date_breaks="1 month", date_labels="%b") + 
        geom_hline(aes(yintercept=0), lty=2)
    if (!is.null(namesSec)) {
        p1 <- p1 + 
            geom_line(data=~filter(., name %in% namesSec), 
                      aes(y=get(yVal)/scaleSec, color=mapper[name], group=name)
                      ) + 
            scale_y_continuous(name=primYLab, 
                               sec.axis=sec_axis(~.*scaleSec, name=secYLab)
                               )
    } else {
        p1 <- p1 + scale_y_continuous(name=primYLab)
    }
    if (length(keyStates) > 1) p1 <- p1 + facet_wrap(~state, scales=if(facetFixed) "fixed" else "free_y")
    if (!is.null(plotTitle)) p1 <- p1 + labs(title=plotTitle)
    if (!is.null(plotSub)) p1 <- p1 + labs(subtitle=plotSub)
    if (!is.null(plotCaption)) p1 <- p1 + labs(caption=plotCaption)
    p1 <- p1 + scale_color_discrete("Source and metric")
    print(p1)
    
}


plotStateMetric(stateData, 
                yVal="vpm7", 
                namesPlot=c("CDC_excess", "USAF_deaths", "CTP_deaths"),
                namesSec=c("CTP_hosp"), 
                keyStates=c("LA", "TX", "MI", "WI"), 
                plotTitle="2020 coronavirus burden per million per day (select states)", 
                plotSub="Deaths on main y-axis, total hospitalized on secondary y-axis", 
                primYLab="Cases per million (7-day rolling mean)",
                secYLab="Total hospitalized per million (7-day rolling mean)",
                facetFixed=TRUE
                )
## 
## Will scale by: 21.6566

plotStateMetric(stateData, 
                yVal="vpm7", 
                namesPlot=c("USAF_cases", "CTP_cases"),
                namesSec=c("CTP_hosp"), 
                keyStates=c("LA", "TX", "MI", "WI"), 
                plotTitle="2020 coronavirus burden per million per day (select states)", 
                plotSub="Cases on main y-axis, total hospitalized on secondary y-axis", 
                primYLab="Cases per million (7-day rolling mean)",
                secYLab="Total hospitalized per million (7-day rolling mean)",
                facetFixed=TRUE
                )
## 
## Will scale by: 0.5920434

plotStateMetric(stateData, 
                yVal="vpm7", 
                namesPlot=c("CTP_cases"),
                namesSec=c("CTP_deaths"), 
                keyStates=c("LA", "TX", "MI", "WI"), 
                plotTitle="2020 coronavirus burden per million per day (select states)", 
                plotSub="Cases on main y-axis, deaths on secondary y-axis", 
                primYLab="Cases per million (7-day rolling mean)",
                secYLab="Deaths per million (7-day rolling mean)",
                facetFixed=TRUE
                )
## 
## Will scale by: 0.02138943

The function plotStateMetric() is updated so that multiple states can be combined and tracked as a single entity:

plotStateMetric <- function(df, 
                            yVal, 
                            namesPlot, 
                            keyStates, 
                            namesSec=NULL,
                            scaleSec=NULL,
                            plotTitle=NULL,
                            plotSub=NULL,
                            plotCaption=NULL,
                            primYLab=NULL,
                            secYLab="Caution, different metric and scale",
                            facetFixed=TRUE,
                            mapper=varMapper, 
                            combStates=vector("character", 0), 
                            popData=NULL, 
                            yValPerCap=(yVal %in% c("vpm", "vpm7")), 
                            printPlot=TRUE, 
                            returnData=FALSE
                            ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame with integrated state data
    # yVal: column to use for the yValues
    # namesPlot: the values of column 'name' to be kept and plotted
    # keyStates: states to be included
    #            if more than one state is passed, facets will be created
    # namesSec: names to be plotted on a secondary y-axes
    # scaleSec: scale to be used for the secondary axis 
    #           namesSec/scaleSec should be similar in magnitude to namesPlot
    # plotTitle: plot title to be used (NULL means none)
    # plotSub: plot subtitle to be used (NULL means none)
    # plotCaption: plot caption to be used (NULL means none)
    # primYLab: primary y label (NULL means use mapper)
    # secYLab: secondary y label (default is "Caution, different metric and scale")
    # facetFixed: boolean, if TRUE scales="fixed", if FALSE scales="free_y"
    #             only relevant if length(keyStates) > 1
    # mapper: mapping file for variable names to descriptive names
    # combStates: states that should be combined together for plotting (named vector, c("state"="newName"))
    # popData: a population file for combining states
    # yValPerCap: boolean, is the y-value of type per-capita?
    # printPlot: boolean, whether to print the plots
    # returnData: boolean, whether to return the data
    
    # Routine is only set up for a secondary axis with facetFixed=TRUE
    if (!is.null(namesSec) & !facetFixed) stop("\nSecondary axis only programmed for scales='fixed'\n")
    
    # Include variables in namesSec as part of namesPlot so they are kept by filter
    if (!is.null(namesSec)) namesPlot <- unique(c(namesPlot, namesSec))
    
    # Filter the data for only the key elements
    df <- df %>%
        select_at(vars(all_of(c("state", "date", "name", yVal)))) %>%
        filter(state %in% keyStates, name %in% namesPlot)
    
    # If there is a list of states to combine, process them
    if (length(combStates) > 0) {
        if (is.null(popData)) { stop("\nCombining states requires population data\n") }
        # Create a data frame with population and new state names
        df <- df %>%
            left_join(select(popData, state, pop), by="state") %>%
            mutate(state=ifelse(state %in% names(combStates), combStates[state], state))
        # Aggregate to the new 'state' level data
        if (yValPerCap) {
            df <- df %>%
                group_by(state, date, name) %>%
                summarize(!!yVal:=sum(get(yVal)*pop)/sum(pop), pop=sum(pop))
        } else {
            df <- df %>%
                group_by(state, date, name) %>%
                summarize(!!yVal:=sum(get(yVal)), pop=sum(pop))
        }
        # Ungroup data frame
        df <- df %>%
            ungroup()
    }
    
    # If there is a secondary scale but no scaleSec has been passed, create one
    if (!is.null(namesSec) & is.null(scaleSec)) {
        maxPrimary <- df %>%
            filter(name %in% setdiff(namesPlot, namesSec)) %>%
            summarize(max(get(yVal), na.rm=TRUE)) %>%
            max()
        maxSecondary <- df %>%
            filter(name %in% namesSec) %>%
            summarize(max(get(yVal), na.rm=TRUE)) %>%
            max()
        scaleSec <- maxSecondary/maxPrimary
        cat("\nWill scale by:", scaleSec, "\n")
    }
    
    # Create the primary y-axis label from mapper if it has not been passed
    if (is.null(primYLab)) primYLab <- mapper[yVal]
    
    # Create the relevant line plot
    if (printPlot) {
        p1 <- df %>%
            filter(!is.na(get(yVal))) %>%
            ggplot(aes_string(x="date")) + 
            geom_line(data=~filter(., name %in% setdiff(namesPlot, namesSec)), 
                      aes(y=get(yVal), group=name, color=mapper[name])
                      ) + 
            scale_x_date(date_breaks="1 month", date_labels="%b") + 
            geom_hline(aes(yintercept=0), lty=2) + 
            labs(x="") + 
            theme(axis.text.x = element_text(angle = 90))
        if (!is.null(namesSec)) {
            p1 <- p1 + 
                geom_line(data=~filter(., name %in% namesSec), 
                          aes(y=get(yVal)/scaleSec, color=mapper[name], group=name)
                          ) + 
                scale_y_continuous(name=primYLab, 
                                   sec.axis=sec_axis(~.*scaleSec, name=secYLab)
                                   )
        } else {
            p1 <- p1 + scale_y_continuous(name=primYLab)
        }
        if (length(keyStates) > 1) p1 <- p1 + facet_wrap(~state, scales=if(facetFixed) "fixed" else "free_y")
        if (!is.null(plotTitle)) p1 <- p1 + labs(title=plotTitle)
        if (!is.null(plotSub)) p1 <- p1 + labs(subtitle=plotSub)
        if (!is.null(plotCaption)) p1 <- p1 + labs(caption=plotCaption)
        p1 <- p1 + scale_color_discrete("Source and metric")
        print(p1)
    }
    
    if (returnData) return(df)
    
}
# Example of combining states (created below with data returned)
plotStateMetric(stateData, 
                yVal="vpm7", 
                namesPlot=c("CTP_cases"),
                namesSec=c("CTP_deaths"), 
                keyStates=c("NY", "NJ", "MA", "CT", "RI", "NH", "VT", "ME"),
                combStates=c("MA"="S NE", "CT"="S NE", "RI"="S NE", "NH"="N NE", "VT"="N NE", "ME"="N NE"),
                plotTitle="2020 coronavirus burden per million per day (select states)", 
                plotSub="Cases on main y-axis, deaths on secondary y-axis", 
                primYLab="Cases per million (7-day rolling mean)",
                secYLab="Deaths per million (7-day rolling mean)",
                facetFixed=TRUE, 
                popData=usafPrepped$usafDemo
                )

The sparsely populated states of northern New England (Maine, New Hampshire, Vermont) have very different coronavirus burden than the more densely populated states of southern New England (Massachusetts, Connecticut, Rhode Island).

The function is updated so that it can create a plot and/or return a dataset:

# Example of combining states
ne_casedeath <- plotStateMetric(stateData, 
                                yVal="vpm7", 
                                namesPlot=c("CTP_cases"),
                                namesSec=c("CTP_deaths"), 
                                keyStates=c("NY", "NJ", "MA", "CT", "RI", "NH", "VT", "ME"),
                                combStates=c("MA"="S NE", "CT"="S NE", "RI"="S NE", 
                                             "NH"="N NE", "VT"="N NE", "ME"="N NE"
                                             ),
                                plotTitle="2020 coronavirus burden per million per day (select states)", 
                                plotSub="Cases on main y-axis, deaths on secondary y-axis", 
                                primYLab="Cases per million (7-day rolling mean)",
                                secYLab="Deaths per million (7-day rolling mean)",
                                facetFixed=TRUE, 
                                popData=usafPrepped$usafDemo,
                                returnData=TRUE
                                )
## `summarise()` regrouping output by 'state', 'date' (override with `.groups` argument)
## 
## Will scale by: 0.07901078

ne_casedeath
## # A tibble: 2,010 x 5
##    state date       name        vpm7     pop
##    <chr> <date>     <chr>      <dbl>   <dbl>
##  1 N NE  2020-03-04 CTP_cases     NA 1330608
##  2 N NE  2020-03-04 CTP_deaths    NA 1330608
##  3 N NE  2020-03-05 CTP_cases     NA 1330608
##  4 N NE  2020-03-05 CTP_deaths    NA 1330608
##  5 N NE  2020-03-06 CTP_cases     NA 1956650
##  6 N NE  2020-03-06 CTP_deaths    NA 1956650
##  7 N NE  2020-03-07 CTP_cases     NA 3285978
##  8 N NE  2020-03-07 CTP_deaths    NA 3285978
##  9 N NE  2020-03-08 CTP_cases     NA 3285978
## 10 N NE  2020-03-08 CTP_deaths    NA 3285978
## # ... with 2,000 more rows

Suppose that the goal is to come up with a single transform on two dimensions to best match the curves to each other. One dimension is time (up to 30 days) and the other dimension is magnitude:

alignCurves <- function(df, 
                        valueMetric, 
                        depName,
                        indepName=setdiff(unique(df$name), depName),
                        lagsTry=0:30, 
                        yLabel="Deaths per million", 
                        depLabel="cases",
                        textMetric=stringr::str_split(yLabel, pattern=" ")[[1]][1] %>% stringr::str_to_lower()
                        ) {
    
    # FUNCTION ARGUMENTS
    # df: a data frame containing state-date-name-valueMetric, with only 2 value types in 'name'
    # valueMetric: the name of the value metric
    # depName: the name of the dependent variable (the other will be the predictor)
    # indepName: the name of the predictor variable
    # lagsTry: the lagged values to attempt
    # yLabel: label for the y-axis
    # depLabel: label for the title (regression x-variable name)
    # textMetric: label for the title (regression y-variable name)
    
    # Check that there are only two values in column 'name'
    if (length(unique(df$name))!=2) { stop("\nFunction depends on 'name' having only two possible values\n") }
    
    # Arrange the data by state and date
    df <- df %>%
        arrange(state, date)
    
    # Function to make a data frame with a specific lag
    helperMakeLagData <- function(df, depName, indepName, valueMetric, lagValue) {
        depData <- df %>%
            filter(name==depName) %>%
            select_at(vars(all_of(c("state", "date", valueMetric)))) %>%
            purrr::set_names(c("state", "date", "depVar"))
        indepData <- df %>%
            filter(name==indepName) %>%
            group_by(state) %>%
            mutate(indepVar=lag(get(valueMetric), lagValue)) %>%
            ungroup() %>%
            select(state, date, indepVar)
        fullData <- depData %>%
            full_join(indepData, by=c("state", "date"))
        fullData
    }
    
    # Run a simple linear model for depName ~ lag(otherName, lagsTry) to assess performance
    lmResults <- vector("list", length(lagsTry))
    n <- 1
    for (lagValue in lagsTry) {
        # Run the linear model with no intercept, save, and increment
        lmResults[[n]] <- lm(depVar ~ indepVar:state + 0, 
                             data=helperMakeLagData(df, 
                                                    depName=depName, 
                                                    indepName=indepName, 
                                                    valueMetric=valueMetric, 
                                                    lagValue=lagValue
                                                    )
                             )
        n <- n + 1
    }
    
    # Find the best lag and coefficients
    dfResults <- tibble::tibble(lags=lagsTry, 
                                rsq=sapply(lmResults, FUN=function(x) summary(x)$r.squared)
                                )
    p1 <- dfResults %>%
        ggplot(aes(x=lags, y=rsq)) + 
        geom_point() + 
        labs(x="Lag", y="R-squared", title="R-squared vs. lag for aligning curves")
    print(p1)
    
    # Calculate the best lag and coefficients
    bestLag <- dfResults %>%
        filter(rsq==max(rsq)) %>%
        pull(lags)
    bestCoef <- coef(lmResults[[which(lagsTry==bestLag)]]) %>%
        as.data.frame() %>% 
        purrr::set_names("mult") %>%
        tibble::rownames_to_column("state") %>%
        mutate(state=str_replace(state, "indepVar:state", ""))
    
    # Plot the curves using the coefficients and lags
    bestDF <- helperMakeLagData(df, 
                                depName=depName, 
                                indepName=indepName, 
                                valueMetric=valueMetric, 
                                lagValue=bestLag
                                ) %>%
        filter(!is.na(indepVar)) %>%
        left_join(bestCoef, by="state") %>%
        mutate(pred=mult*indepVar)
    p2 <- bestDF %>%
        select(state, date, depVar, pred, mult) %>%
        pivot_longer(-c(state, date, mult)) %>%
        mutate(name=case_when(name=="depVar" ~ "Actual value", 
                              name=="pred" ~ "Predicted value\n(lag, mult)", 
                              TRUE ~ "Unknown Element"
                              )
               ) %>%
        ggplot(aes(x=date, y=value)) + 
        geom_line(aes(group=name, color=name)) + 
        geom_text(data=~filter(., date==max(date)), 
                  aes(x=date, y=+Inf, label=paste0("Multiplier: ", round(mult, 3))), 
                  hjust=1, 
                  vjust=1
                  ) +
        labs(x="", 
             y=yLabel, 
             title=paste0("Predicting ", 
                          textMetric, 
                          " based on lagged ", 
                          depLabel, 
                          " (best lag: ", 
                          bestLag, 
                          " days)"
                          ),
             subtitle="Best lag is based on highest correlation/R-squared, common across all facets"
             ) + 
        facet_wrap(~state) + 
        scale_x_date(date_breaks="1 month", date_labels="%b") + 
        theme(axis.text.x = element_text(angle = 90)) + 
        scale_color_discrete("Metric")
    print(p2)
    
    # Return the key data
    list(bestLag=bestLag, bestCoef=bestCoef, bestDF=bestDF, lmResults=lmResults)
    
}

lmOut_ne <- alignCurves(ne_casedeath, valueMetric="vpm7", depName="CTP_deaths", lagsTry=0:20)

## Warning: Removed 3 row(s) containing missing values (geom_path).

# States with later deaths
so_casedeath <- plotStateMetric(stateData, 
                                yVal="vpm7", 
                                namesPlot=c("CTP_cases"),
                                namesSec=c("CTP_deaths"), 
                                keyStates=c("FL", "GA", "TX", "AZ"),
                                plotTitle="2020 coronavirus burden per million per day (select states)", 
                                plotSub="Cases on main y-axis, deaths on secondary y-axis", 
                                primYLab="Cases per million (7-day rolling mean)",
                                secYLab="Deaths per million (7-day rolling mean)",
                                facetFixed=TRUE, 
                                popData=usafPrepped$usafDemo,
                                returnData=TRUE
                                )
## 
## Will scale by: 0.02075921

so_casedeath
## # A tibble: 1,880 x 4
##    state date       name         vpm7
##    <chr> <date>     <chr>       <dbl>
##  1 AZ    2020-03-04 CTP_cases NA     
##  2 AZ    2020-03-05 CTP_cases NA     
##  3 AZ    2020-03-06 CTP_cases NA     
##  4 AZ    2020-03-07 CTP_cases  0.0837
##  5 AZ    2020-03-08 CTP_cases  0.146 
##  6 AZ    2020-03-09 CTP_cases  0.146 
##  7 AZ    2020-03-10 CTP_cases  0.126 
##  8 AZ    2020-03-11 CTP_cases  0.146 
##  9 AZ    2020-03-12 CTP_cases  0.146 
## 10 AZ    2020-03-13 CTP_cases  0.272 
## # ... with 1,870 more rows
lmOut_so <- alignCurves(so_casedeath, valueMetric="vpm7", depName="CTP_deaths", lagsTry=0:20)

## Warning: Removed 3 row(s) containing missing values (geom_path).

The states with later disease burden seem to have 1) longer lag time from case to death; and 2) lower rate of deaths per case. Both findings would be consistent with greater testing leading to both earlier diagnosis and increased diagnosis of less severe cases.

The process can all be run together with a main function:

createAndAlignCurves <- function(df, 
                                 yVal, 
                                 namesPlot, 
                                 keyStates,
                                 lagValueMetric,
                                 lagDepName,
                                 namesSec=NULL,
                                 scaleSec=NULL,
                                 plotTitle=NULL,
                                 plotSub=NULL,
                                 plotCaption=NULL,
                                 primYLab=NULL,
                                 secYLab="Caution, different metric and scale",
                                 facetFixed=TRUE,
                                 mapper=varMapper, 
                                 combStates=vector("character", 0), 
                                 popData=NULL, 
                                 yValPerCap = (yVal %in% c("vpm", "vpm7")), 
                                 printPlot=TRUE, 
                                 ...
                                 ) {
    
    # FUNCTION ARGUMENTS:
    # df: data frame with integrated state data
    # yVal: column to use for the yValues
    # namesPlot: the values of column 'name' to be kept and plotted
    # keyStates: states to be included
    #            if more than one state is passed, facets will be created
    # lagValueMetric: the metric to be used for checking lags (typically 'vpm7')
    # lagDepName: dependent variable (records in column 'name') to be used for the lagging process
    # namesSec: names to be plotted on a secondary y-axes
    # scaleSec: scale to be used for the secondary axis 
    #           namesSec/scaleSec should be similar in magnitude to namesPlot
    # plotTitle: plot title to be used (NULL means none)
    # plotSub: plot subtitle to be used (NULL means none)
    # plotCaption: plot caption to be used (NULL means none)
    # primYLab: primary y label (NULL means use mapper)
    # secYLab: secondary y label (default is "Caution, different metric and scale")
    # facetFixed: boolean, if TRUE scales="fixed", if FALSE scales="free_y"
    #             only relevant if length(keyStates) > 1
    # mapper: mapping file for variable names to descriptive names
    # combStates: states that should be combined together for plotting (named vector, c("state"="newName"))
    # popData: a population file for combining states
    # yValPerCap: boolean, is the y-value of type per-capita?
    # printPlot: boolean, whether to print the plots
    # ...: other arguments to be passed to alignCurves()

    # Create a frame to be used by the lagging process
    tempMetrics <- plotStateMetric(df, 
                                   yVal=yVal, 
                                   namesPlot=namesPlot,
                                   keyStates=keyStates,
                                   namesSec=namesSec, 
                                   scaleSec=scaleSec,
                                   plotTitle=plotTitle, 
                                   plotSub=plotSub, 
                                   plotCaption=plotCaption,
                                   primYLab=primYLab,
                                   secYLab=secYLab,
                                   facetFixed=facetFixed,
                                   mapper=mapper,
                                   combStates=combStates,
                                   popData=popData,
                                   yValPerCap=yValPerCap,
                                   printPlot=printPlot,
                                   returnData=TRUE  # the data must be returned for the next function
                                   )

    # Run the lagging process    
    tempLM <- alignCurves(tempMetrics, valueMetric=lagValueMetric, depName=lagDepName, ...)
    
    # Return the key values
    list(dfList=tempMetrics, lmList=tempLM)
    
}


# Example for northeastern states
neCurveList <- createAndAlignCurves(stateData, 
                                    yVal="vpm7", 
                                    namesPlot=c("CTP_cases"),
                                    namesSec=c("CTP_deaths"), 
                                    keyStates=c("NY", "NJ", "MA", "CT", "RI", "NH", "VT", "ME", "DE", "DC"),
                                    combStates=c("MA"="S NE", "CT"="S NE", "RI"="S NE", 
                                                 "NH"="N NE", "VT"="N NE", "ME"="N NE", 
                                                 "NY"="NY/NJ", "NJ"="NY/NJ", 
                                                 "DE"="DE/DC", "DC"="DE/DC"
                                                 ),
                                    plotTitle="2020 coronavirus burden per million per day (select states)", 
                                    plotSub="Cases on main y-axis, deaths on secondary y-axis", 
                                    primYLab="Cases per million (7-day rolling mean)",
                                    secYLab="Deaths per million (7-day rolling mean)",
                                    facetFixed=TRUE, 
                                    popData=usafPrepped$usafDemo, 
                                    printPlot=TRUE,
                                    lagValueMetric="vpm7", 
                                    lagDepName="CTP_deaths", 
                                    lagsTry=0:30
                                    )
## `summarise()` regrouping output by 'state', 'date' (override with `.groups` argument)
## 
## Will scale by: 0.07975869

## Warning: Removed 3 row(s) containing missing values (geom_path).

# Example for midwestern states
mwCurveList <- createAndAlignCurves(stateData, 
                                    yVal="vpm7", 
                                    namesPlot=c("CTP_cases"),
                                    namesSec=c("CTP_deaths"), 
                                    keyStates=c("OH", "MI", "IN", "IL"),
                                    plotTitle="2020 coronavirus burden per million per day (select states)", 
                                    plotSub="Cases on main y-axis, deaths on secondary y-axis", 
                                    primYLab="Cases per million (7-day rolling mean)",
                                    secYLab="Deaths per million (7-day rolling mean)",
                                    facetFixed=TRUE, 
                                    popData=usafPrepped$usafDemo, 
                                    printPlot=TRUE,
                                    lagValueMetric="vpm7", 
                                    lagDepName="CTP_deaths", 
                                    lagsTry=0:30
                                    )
## 
## Will scale by: 0.04523392

## Warning: Removed 3 row(s) containing missing values (geom_path).

The midwestern states show the need for the lag value and multiplier to change over time. When a state has a single epidemic peak, such as NY/NJ, metrics can align around values as of the peak. When a state has multiple peaks or smoldering disease (on the x or y metric), then a single value for lag with a single multiplier fails to align the curves.

As an example, the 5.6% deaths vs. cases estimate for Michigan is much too low in April-May and much too high in August-October.

Suppose that separate multipliers are allowed in every month (a likely driver of overfit, but potentially helpful for assessing trends):

# Extract the Michigan data
miData <- mwCurveList$dfList %>%
    filter(state=="MI", !is.na(vpm7)) %>%
    pivot_wider(names_from="name", values_from="vpm7")

# Suppose that a 10-day lag and 20-day lag are calculated, and that lag grows linearly from April 1 -Sep 30
fullTime <- as.integer(as.Date("2020-09-30")-as.Date("2020-03-31"))
miData <- miData %>%
    arrange(state, date) %>%
    group_by(state) %>%
    mutate(lag10=lag(CTP_cases, 10), 
           lag20=lag(CTP_cases, 20), 
           pct10=pmin(pmax(as.integer(as.Date("2020-09-30")-date)/fullTime, 0), 1), 
           indepVar=ifelse(is.na(lag10), NA, pct10*lag10 + (1-pct10)*ifelse(is.na(lag20), 0, lag20)), 
           mon=factor(month.abb[lubridate::month(date)], levels=month.abb)
           ) %>%
    ungroup()

# Regression for Michigan data
miLM <- lm(CTP_deaths ~ indepVar:mon + 0, data=miData)
summary(miLM)
## 
## Call:
## lm(formula = CTP_deaths ~ indepVar:mon + 0, data = miData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.35211 -0.28842 -0.03148  0.32132  2.27188 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## indepVar:monMar 0.089415   0.002944  30.375  < 2e-16 ***
## indepVar:monApr 0.121001   0.001069 113.146  < 2e-16 ***
## indepVar:monMay 0.064522   0.001539  41.934  < 2e-16 ***
## indepVar:monJun 0.040358   0.003812  10.588  < 2e-16 ***
## indepVar:monJul 0.017567   0.002592   6.776 1.18e-10 ***
## indepVar:monAug 0.013901   0.001574   8.830 3.81e-16 ***
## indepVar:monSep 0.014323   0.001501   9.544  < 2e-16 ***
## indepVar:monOct 0.019051   0.001591  11.975  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6312 on 214 degrees of freedom
##   (10 observations deleted due to missingness)
## Multiple R-squared:  0.9868, Adjusted R-squared:  0.9863 
## F-statistic:  1994 on 8 and 214 DF,  p-value: < 2.2e-16
# Plot of deaths vs cases rate by month
coef(miLM) %>%
    as.data.frame() %>%
    purrr::set_names("CFR") %>%
    tibble::rownames_to_column("mon") %>%
    mutate(mon=factor(stringr::str_replace(mon, pattern="indepVar:mon", replacement=""), levels=month.abb)) %>%
    ggplot(aes(x=mon, y=CFR)) + 
    geom_col(fill="lightblue") + 
    geom_text(aes(y=CFR/2, label=paste0(round(100*CFR, 1), "%"))) + 
    labs(x="", 
         y="Deaths as percentage of lagged cases", 
         title="Michigan deaths vs. lagged cases", 
         subtitle="Assumed 10-day lag in March interpolated to 20-day lag in September", 
         caption="Linear model coefficients on lagged data with no intercept used to estimate CFR"
         )

As expected, CFR is very different in the early peak and in the later forms of disease. The above is converted to a function so that other states can be assessed (again using a rule-of-thumb rather than calculation for lag time periods):

assessStateCFR <- function(lst, 
                           keyStates, 
                           depVar, 
                           indepVar,
                           depTitleName, 
                           indepTitleName,
                           keyMetric="vpm7", 
                           lagEarlyDate=as.Date("2020-03-31"), 
                           lagLateDate=as.Date("2020-09-30"), 
                           lagEarlyValue=10, 
                           lagLateValue=20
                           ) {
    
    # FUNCTION ARGUMENTS:
    # lst: A list such as produced by createAndAlignCurves()
    # keyStates: The key states to be extracted from the list
    # depVar: the dependent variable
    # indepVar: the independent variable
    # depTitleName: the name for the dependent variable in the title
    # indepTitleName: the name for the independent variable in the plot title
    # keyMetric: the name of the key metric that is being assessed
    # lagEarlyDate: the date for the earliest lagging calculation (dates before this will be at lagEarlyValue)
    # lagLateDate: the date for the latest lagging calculation (dates after this will be at lagLateValue)
    # lagEarlyValue: the value for lag on lagEarlyDate, will be linearly interpolated to lagLateValue/Date
    # lagLateValue: the value for lag on lagLateDate, will be linearly interpolated from lagEarlyValue/Date
    
    # Extract the data for keyStates
    df <- lst[["dfList"]] %>%
        filter(state %in% keyStates, !is.na(get(keyMetric))) %>%
        pivot_wider(names_from="name", values_from=keyMetric)

    # Apply the assumed lagging information
    fullTime <- as.integer(lagLateDate-lagEarlyDate)
    df <- df %>%
        arrange(state, date) %>%
        group_by(state) %>%
        mutate(lag10=lag(get(indepVar), lagEarlyValue), 
               lag20=lag(get(indepVar), lagLateValue), 
               pct10=pmin(pmax(as.integer(lagLateDate-date)/fullTime, 0), 1), 
               x=ifelse(is.na(lag10), NA, pct10*lag10 + (1-pct10)*ifelse(is.na(lag20), 0, lag20)), 
               y=get(depVar),
               mon=factor(month.abb[lubridate::month(date)], levels=month.abb)
               ) %>%
        filter(!is.na(x)) %>%
        ungroup()

    # Regression for data from keyStates
    if (length(keyStates) > 1) stateLM <- lm(y ~ x:mon:state + 0, data=df) 
    else stateLM <- lm(y ~ x:mon + 0, data=df)
    
    # Add the predicted value to df
    df <- df %>%
        mutate(pred=predict(stateLM))

    # Plot of curve overlaps
    p1 <- df %>%
        select(state, date, y, pred) %>%
        pivot_longer(-c(state, date)) %>%
        ggplot(aes(x=date, y=value)) + 
        geom_line(aes(color=c("pred"="Predicted", "y"="Actual")[name], group=name)) + 
        scale_x_date(date_breaks="1 month", date_labels="%b") + 
        labs(x="", 
             y=stringr::str_to_title(depTitleName), 
             title=paste0("Predicted vs. actual ", depTitleName)
             ) +
        scale_color_discrete("Metric") +
        facet_wrap(~state)
    print(p1)
    
    # Plot of rate by month
    p2 <- coef(stateLM) %>%
        as.data.frame() %>%
        purrr::set_names("CFR") %>%
        tibble::rownames_to_column("monState") %>%
        mutate(mon=factor(stringr::str_replace_all(monState, pattern="x:mon|:state.+", replacement=""), 
                          levels=month.abb
                          ), 
               state=if (length(keyStates)==1) keyStates 
                     else stringr::str_replace_all(monState, pattern="x:mon[A-Za-z]{3}:state", replacement="")
               ) %>%
        ggplot(aes(x=mon, y=CFR)) + 
        geom_col(fill="lightblue") + 
        geom_text(aes(y=CFR/2, label=paste0(round(100*CFR, 1), "%"))) + 
        labs(x="", 
             y=paste0(stringr::str_to_title(depTitleName), " as percentage of lagged ", indepTitleName), 
             title=paste0(stringr::str_to_title(depTitleName), 
                          " vs. lagged ", 
                          indepTitleName, 
                          " in state(s): ", 
                          paste0(keyStates, collapse=", ")
                          ), 
             subtitle=paste0("Assumed ", 
                             lagEarlyValue, 
                             "-day lag on ", 
                             lagEarlyDate,
                             " interpolated to ", 
                             lagLateValue,
                             "-day lag on ", 
                             lagLateDate
                             ), 
             caption="Linear model coefficients on lagged data with no intercept used to estimate percentage"
             ) + 
        facet_wrap(~state)
    print(p2)

    # Return the data frame
    df
    
}

# Deaths vs. cases in Michigan
assessStateCFR(mwCurveList, 
               keyStates="MI", 
               depVar="CTP_deaths", 
               indepVar="CTP_cases", 
               depTitleName="deaths", 
               indepTitleName="cases"
               )
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(keyMetric)` instead of `keyMetric` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.

## # A tibble: 222 x 11
##    state date       CTP_cases CTP_deaths lag10 lag20 pct10     x      y mon  
##    <chr> <date>         <dbl>      <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl> <fct>
##  1 MI    2020-03-14      45.1     0.0288  2.51    NA     1  2.51 0.0288 Mar  
##  2 MI    2020-03-15      54.8     0.0720  3.41    NA     1  3.41 0.0720 Mar  
##  3 MI    2020-03-16      61.6     0.144   4.97    NA     1  4.97 0.144  Mar  
##  4 MI    2020-03-17      69.9     0.187   6.88    NA     1  6.88 0.187  Mar  
##  5 MI    2020-03-18      76.2     0.331   9.40    NA     1  9.40 0.331  Mar  
##  6 MI    2020-03-19      81.1     0.518  12.9     NA     1 12.9  0.518  Mar  
##  7 MI    2020-03-20      88.8     0.821  17.6     NA     1 17.6  0.821  Mar  
##  8 MI    2020-03-21      93.5     1.21   22.2     NA     1 22.2  1.21   Mar  
##  9 MI    2020-03-22      98.0     1.53   27.9     NA     1 27.9  1.53   Mar  
## 10 MI    2020-03-23     104.      2.15   36.3     NA     1 36.3  2.15   Mar  
## # ... with 212 more rows, and 1 more variable: pred <dbl>
# Deaths vs. cases in NY/NJ and S NE
assessStateCFR(neCurveList, 
               keyStates=c("NY/NJ", "S NE"), 
               depVar="CTP_deaths", 
               indepVar="CTP_cases", 
               depTitleName="deaths", 
               indepTitleName="cases"
               )

## Warning: Removed 2 rows containing missing values (position_stack).
## Warning: Removed 2 rows containing missing values (geom_text).

## # A tibble: 494 x 12
##    state date          pop CTP_cases CTP_deaths lag10 lag20 pct10     x     y
##    <chr> <date>      <dbl>     <dbl>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 NY/NJ 2020-02-23 8.96e6    0               0     0    NA     1     0     0
##  2 NY/NJ 2020-02-24 8.96e6    0               0     0    NA     1     0     0
##  3 NY/NJ 2020-02-25 8.96e6    0               0     0    NA     1     0     0
##  4 NY/NJ 2020-02-26 8.96e6    0               0     0    NA     1     0     0
##  5 NY/NJ 2020-02-27 8.96e6    0               0     0    NA     1     0     0
##  6 NY/NJ 2020-02-28 8.96e6    0               0     0    NA     1     0     0
##  7 NY/NJ 2020-02-29 8.96e6    0               0     0    NA     1     0     0
##  8 NY/NJ 2020-03-01 8.96e6    0               0     0    NA     1     0     0
##  9 NY/NJ 2020-03-02 8.96e6    0.0159          0     0    NA     1     0     0
## 10 NY/NJ 2020-03-03 8.96e6    0.0159          0     0    NA     1     0     0
## # ... with 484 more rows, and 2 more variables: mon <fct>, pred <dbl>

A few findings include:

  1. Allowing the rate to change over time significantly improves fit for Michigan (and likely for other states that do not have solely a single peak epidemic)
  2. Assuming a lag appears sub-optimal for NY/NJ and S NE where the peak of the epidemic is better suited for finding a lag through data rather than a heuristic

Next steps include 1) finding the lag automatically while allowing for change over time; and 2) smoothing monthly CFR borders that drives spikes/cliffs in predicted values at the month end/start.

The function assessStateCFR() is updated to allow for automatically finding the best early and/or late lag:

# Updated for automatic lag time assessment
assessStateCFR <- function(lst, 
                           keyStates, 
                           depVar, 
                           indepVar,
                           depTitleName, 
                           indepTitleName,
                           keyMetric="vpm7", 
                           lagEarlyDate=as.Date("2020-03-31"), 
                           lagMidDate=NULL,
                           lagLateDate=as.Date("2020-09-30"), 
                           lagEarlyValue=10, 
                           lagLateValue=20, 
                           lagsTry=0:30
                           ) {
    
    # FUNCTION ARGUMENTS:
    # lst: A list such as produced by createAndAlignCurves()
    # keyStates: The key states to be extracted from the list
    # depVar: the dependent variable
    # indepVar: the independent variable
    # depTitleName: the name for the dependent variable in the title
    # indepTitleName: the name for the independent variable in the plot title
    # keyMetric: the name of the key metric that is being assessed
    # lagEarlyDate: the date for the earliest lagging calculation (dates before this will be at lagEarlyValue)
    # lagMidDate: if lags are found from data, what midpoint should be used to split data as early vs late?
    #             NULL means midway between lagEarlyDate and lagLateDate
    # lagLateDate: the date for the latest lagging calculation (dates after this will be at lagLateValue)
    # lagEarlyValue: the value for lag on lagEarlyDate, will be linearly interpolated to lagLateValue/Date
    #                NULL means calculate from data and may differ by state
    # lagLateValue: the value for lag on lagLateDate, will be linearly interpolated from lagEarlyValue/Date
    #               NULL means estimate from data and may differ by state
    # lagsTry: the values for lag to be attempted if lageEarlyValue and/or lagLateValue is NULL
    
    # Extract the data for keyStates
    df <- lst[["dfList"]] %>%
        filter(state %in% keyStates, !is.na(get(keyMetric))) %>%
        pivot_wider(names_from="name", values_from=keyMetric)

    # Function for finding lag time correlations
    helperLagCor <- function(lt, lf, dp, id) {
        lf %>%
            group_by(state) %>%
            mutate(y=get(dp), x=lag(get(id), lt)) %>%
            summarize(p=cor(x, y, use="complete.obs")) %>%
            ungroup() %>%
            mutate(lag=lt)
    }
    
    # Middle date for splitting data
    if (is.null(lagMidDate)) lagMidDate <- mean(c(lagEarlyDate, lagLateDate))
    
    # Get the early lags from the data
    eLag <- map_dfr(.x=lagsTry, .f=helperLagCor, lf=filter(df, date<=lagMidDate), dp=depVar, id=indepVar) %>%
        group_by(state) %>%
        filter(p==max(p)) %>%
        filter(row_number()==1) %>%
        ungroup() %>%
        select(state, earlyLag=lag)
    
    # Get the late lags from the data
    lLag <- map_dfr(.x=lagsTry, .f=helperLagCor, lf=filter(df, date>lagMidDate), dp=depVar, id=indepVar) %>%
        group_by(state) %>%
        filter(p==max(p)) %>%
        filter(row_number()==1) %>%
        ungroup() %>%
        select(state, lateLag=lag)
    
    # Create the full lag frame, including substituting the fixed value(s) if provided
    lagFrame <- eLag %>%
        inner_join(lLag, by="state")
    if (!is.null(lagEarlyValue)) lagFrame <- lagFrame %>% mutate(earlyLag=lagEarlyValue)
    if (!is.null(lagLateValue)) lagFrame <- lagFrame %>% mutate(lateLag=lagLateValue)
    print(lagFrame)
    
    # Apply the assumed lagging information
    fullTime <- as.integer(lagLateDate-lagEarlyDate)
    df <- df %>%
        left_join(lagFrame, by="state") %>%
        arrange(state, date) %>%
        group_by(state) %>%
        mutate(eLag=lag(get(indepVar), mean(earlyLag)), 
               lLag=lag(get(indepVar), mean(lateLag)), 
               pctEarly=pmin(pmax(as.integer(lagLateDate-date)/fullTime, 0), 1), 
               x=ifelse(is.na(eLag), NA, pctEarly*eLag + (1-pctEarly)*ifelse(is.na(lLag), 0, lLag)), 
               y=get(depVar),
               mon=factor(month.abb[lubridate::month(date)], levels=month.abb)
               ) %>%
        filter(!is.na(x)) %>%
        ungroup()

    # Regression for data from keyStates
    if (length(keyStates) > 1) stateLM <- lm(y ~ x:mon:state + 0, data=df) 
    else stateLM <- lm(y ~ x:mon + 0, data=df)
    
    # Add the predicted value to df
    df <- df %>%
        mutate(pred=predict(stateLM))

    # Plot of curve overlaps
    p1 <- df %>%
        select(state, date, y, pred) %>%
        pivot_longer(-c(state, date)) %>%
        ggplot(aes(x=date, y=value)) + 
        geom_line(aes(color=c("pred"="Predicted", "y"="Actual")[name], group=name)) + 
        scale_x_date(date_breaks="1 month", date_labels="%b") + 
        labs(x="", 
             y=stringr::str_to_title(depTitleName), 
             title=paste0("Predicted vs. actual ", depTitleName)
             ) +
        scale_color_discrete("Metric") +
        facet_wrap(~state)
    print(p1)
    
    # Plot of rate by month
    p2 <- coef(stateLM) %>%
        as.data.frame() %>%
        purrr::set_names("CFR") %>%
        tibble::rownames_to_column("monState") %>%
        mutate(mon=factor(stringr::str_replace_all(monState, pattern="x:mon|:state.+", replacement=""), 
                          levels=month.abb
                          ), 
               state=if (length(keyStates)==1) keyStates 
                     else stringr::str_replace_all(monState, pattern="x:mon[A-Za-z]{3}:state", replacement="")
               ) %>%
        left_join(lagFrame, by="state") %>%
        ggplot(aes(x=mon, y=CFR)) + 
        geom_col(fill="lightblue") + 
        geom_text(aes(y=CFR/2, label=paste0(round(100*CFR, 1), "%"))) +
        geom_text(data=~filter(., mon==month.abb[lubridate::month(lagMidDate)]), 
                  aes(x=-Inf, y=Inf, label=paste0("Early Lag: ", earlyLag)), 
                  hjust=0, 
                  vjust=1
                  ) + 
        geom_text(data=~filter(., mon==month.abb[lubridate::month(lagMidDate)]), 
                  aes(x=Inf, y=Inf, label=paste0("Late Lag: ", lateLag)), 
                  hjust=1, 
                  vjust=1
                  ) + 
        labs(x="", 
             y=paste0(stringr::str_to_title(depTitleName), " as percentage of lagged ", indepTitleName), 
             title=paste0(stringr::str_to_title(depTitleName), 
                          " vs. lagged ", 
                          indepTitleName, 
                          " in state(s): ", 
                          paste0(keyStates, collapse=", ")
                          ), 
             subtitle=paste0("Assumed early lag on ", 
                             lagEarlyDate,
                             " interpolated to late lag on ", 
                             lagLateDate
                             ), 
             caption="Linear model coefficients on lagged data with no intercept used to estimate percentage"
             ) + 
        facet_wrap(~state)
    print(p2)

    # Return the data frame
    df
    
}

# Deaths vs. cases in Michigan
mwOut <- assessStateCFR(mwCurveList, 
                        keyStates=c("MI", "IL", "IN"), 
                        depVar="CTP_deaths", 
                        indepVar="CTP_cases", 
                        depTitleName="deaths", 
                        indepTitleName="cases", 
                        lagEarlyValue=NULL,
                        lagLateValue=NULL
                        )
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 IN           2       5
## 2 IL           5       0
## 3 MI          10       0

# Deaths vs. cases in NY/NJ and S NE
neOut <- assessStateCFR(neCurveList, 
                        keyStates=c("NY/NJ", "S NE"), 
                        depVar="CTP_deaths", 
                        indepVar="CTP_cases", 
                        depTitleName="deaths", 
                        indepTitleName="cases", 
                        lagEarlyValue=NULL,
                        lagLateValue=NULL
                        )
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 NY/NJ        6      30
## 2 S NE         7       0

## Warning: Removed 2 rows containing missing values (position_stack).
## Warning: Removed 2 rows containing missing values (geom_text).

The ability to have data-driven lags by state improves the fit. Next steps are to work on smoothing the scaling factor over time.

Suppose that the full process is run to look at time from hospitalization to death:

# Example for midwestern states
mwHospDeathList <- createAndAlignCurves(stateData, 
                                        yVal="vpm7", 
                                        namesPlot=c("CTP_hosp"),
                                        namesSec=c("CTP_deaths"), 
                                        keyStates=c("OH", "MI", "IN", "IL"),
                                        plotTitle="2020 coronavirus burden per million per day", 
                                        plotSub="Hospitalized on main y-axis, deaths on secondary y-axis", 
                                        primYLab="Hospitalized per million (7-day rolling mean)",
                                        secYLab="Deaths per million (7-day rolling mean)",
                                        facetFixed=TRUE, 
                                        popData=usafPrepped$usafDemo, 
                                        printPlot=TRUE,
                                        lagValueMetric="vpm7", 
                                        lagDepName="CTP_deaths", 
                                        lagsTry=0:30
                                        )
## 
## Will scale by: 0.04114457

## Warning: Removed 3 row(s) containing missing values (geom_path).

# Deaths vs. cases in midwestern states
mwHospOut <- assessStateCFR(mwHospDeathList, 
                        keyStates=c("OH", "MI", "IL", "IN"), 
                        depVar="CTP_deaths", 
                        indepVar="CTP_hosp", 
                        depTitleName="deaths", 
                        indepTitleName="hospitalized", 
                        lagEarlyValue=NULL,
                        lagLateValue=NULL
                        )
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 MI           3       0
## 2 IN           5       3
## 3 OH           7      30
## 4 IL           9       7

## Warning: Removed 2 rows containing missing values (position_stack).
## Warning: Removed 2 rows containing missing values (geom_text).

Total number in the hospital (note that this is not new hospitalizations) appears to be a fairly strong predictor of coronavirus deaths in the next week or so.

Suppose that a goal is to look at correlations between two metrics using 1) different lags, and 2) different windows of time. Suppose that there are two vectors of equal length, perfectly correlated on a lagged basis:

v1 <- sin((0:359) * pi / 180)
v2 <- cos((0:359) * pi / 180)
tibble::tibble(x=0:359, v1=v1, v2=v2) %>%
    pivot_longer(-x) %>%
    ggplot(aes(x=x, y=value)) + 
    geom_line(aes(group=name, color=name))

Correlations can be explored on various lags of the data:

tibble::tibble(lags=0:180) %>%
    mutate(p=sapply(lags, FUN=function(x) cor(v1, lag(v2, x), use="complete.obs"))) %>%
    ggplot(aes(x=lags, y=p)) + 
    geom_point() + 
    labs(x="Lagged degrees", y="Correlation", title="Correlation of sin and cos by lagged degrees")

As expected, the curves are perfectly correlated with a 90-degree lag. Suppose another goal is to find the best lag (lag producing the highest correlation) based on only a “window” of the data. Essentially, for each lag, run a subset of [start:(start+window-1)] for the relevant vectors. A function is written:

lagVectorWindows <- function(v1, 
                             v2, 
                             lagsTry=0:30, 
                             windowSize=30, 
                             minNoNA=ceiling(windowSize/2), 
                             updateStatus=FALSE
                             ) {
    
    # FUNCTION ARGUMENTS:
    # v1: the first vector, which will be used 'as is'
    # v2: the second vector, which will be lagged by various values for lagsTry
    # lagsTry: the values for lag that will be used in cor(v1, lag(v2, lag))
    # windowSize: the size of the window to use in taking snippets of v1 and lagged v2
    # minNoNA: minimum number of non-NA lagged values needed to calculate a correlation
    # updateStates: whether to print which window is being worked on
    
    # Find the list of all possible window start points
    windowStarts <- 1:(length(v1)-windowSize+1)
    
    # Helper function to create a frame of correlations
    helperCorr <- function(s) {
        
        # Create the end point for the vector
        e <- s + windowSize - 1
        
        # Announce the start
        if (updateStatus) cat("\nProcessing window starting at:", s)
        
        # Create the correlations tibble for all values of lag, and return
        tibble::tibble(startWindow=s, endWindow=e, lags=lagsTry) %>%
            mutate(na1=sum(is.na(v1[s:e])), 
                   na2=sapply(lags, FUN=function(x) sum(is.na(lag(v2, x)[s:e]))), 
                   p=sapply(lags, 
                            FUN=function(x) { 
                                ifelse(sum(!is.na(lag(v2, x)[s:e])) < minNoNA,
                                       NA, 
                                       cor(v1[s:e], lag(v2, x)[s:e], use="complete.obs")
                                       ) 
                                } 
                            )
                   )
    }

    # Bind the correlations frames and return
    map_dfr(windowStarts, .f=helperCorr)
    
}

tmpDB <- lagVectorWindows(v1=v1, v2=v2)

tmpDB %>%
    filter(!is.na(p)) %>%
    ggplot(aes(x=startWindow, y=lags)) + 
    geom_tile(aes(fill=p)) + 
    scale_fill_continuous(low="white", high="lightgreen") + 
    labs(x="Starting at degrees", y="Lagged by", title="Correlations based on lag and starting window")

The correlations are (essentially) 1 when the curves are both going up/down at the same time, and are (essentially) -1 when one curve rises at the same time that the other curve falls. This is driven by 30 degrees being a much too small window. Suppose instead that windows of 90 degrees are applied:

tmpDB <- lagVectorWindows(v1=v1, v2=v2, windowSize=90, lagsTry=0:180)

tmpDB %>%
    filter(!is.na(p)) %>%
    ggplot(aes(x=startWindow, y=lags)) + 
    geom_tile(aes(fill=p)) + 
    scale_fill_gradient2(midpoint=0) + 
    labs(x="Starting at degrees", 
         y="Lagged by degrees", 
         title="Correlations based on lag and starting window", 
         subtitle="Window of 90 degrees, lags of 0 to 180 degrees"
         ) + 
    geom_hline(yintercept=90, lty=2)

tmpDB %>%
    filter(!is.na(p)) %>%
    mutate(lags=round(lags/5, 0)*5) %>%
    ggplot(aes(x=factor(lags), y=p)) + 
    geom_boxplot(fill="lightblue") + 
    labs(x="Lag, rounded to nearest 5 degrees", y="Correlation", title="Correlations by lag value")

tmpDB %>%
    filter(!is.na(p)) %>%
    group_by(startWindow) %>%
    filter(p==max(p)) %>%
    ungroup() %>%
    ggplot(aes(x=startWindow, y=lags)) + 
    geom_point() +
    labs(x="Window starting degrees", y="Lag with highest correlation", title="Best lag by starting window")

As expected, a lag of 90 degrees always produces a high correlation. Since the curves are periodic, there is often a strong correlation at other values of lag also, though these are dependent on the starting point for the window and can also produce strong negative correlations.

The lag producing the highest correlation is consistently the closest allowable value to 90 degrees, which is known to be the answer for sin and cos.

This process can be leveraged to start looking at various coronavirus disease burden curves, to see what values of lag produce the best correlations given a specific window of time. Exploring this is the next step.

Suppose that the lag methods are run for the state of MI:

# Extract the Michigan data
miData <- mwCurveList$dfList %>%
    filter(state=="MI", !is.na(vpm7), date>="2020-02-01", date<="2020-10-15") %>%
    arrange(date, name)
    
# Extract cases and deaths
miCases <- miData %>% filter(name=="CTP_cases") %>% pull(vpm7)
miDeaths <- miData %>% filter(name=="CTP_deaths") %>% pull(vpm7)

# Confirm that dates are the same for both vectors
miDates <- miData %>% filter(name=="CTP_cases") %>% pull(date)
if (!all.equal(miDates, miData %>% filter(name=="CTP_deaths") %>% pull(date))) stop("\nDate mismatch\n")

# Find the lags in the MI data
miLags <- lagVectorWindows(miDeaths, miCases, lagsTry=0:30, windowSize=56)

# Box plot for correlations by lag
miLags %>%
    filter(!is.na(p)) %>%
    ggplot(aes(x=factor(lags), y=p)) + 
    geom_boxplot(fill="lightblue") + 
    labs(x="Lag (days)", y="Correlation", title="Box plot of correlation by lag (days)")

At a glance, every lag value is roughly equal in correlation, potentially driven by the one big early spike where there should be a strong correlation at a defined lag:

miLags %>%
    filter(!is.na(p)) %>%
    group_by(startWindow) %>%
    filter(p==max(p)) %>%
    ggplot(aes(x=miDates[startWindow], y=lags)) + 
    geom_point(aes(size=p)) + 
    labs(x="Window start date", y="Best lag (days)", title="Best lag by window starting date") + 
    scale_size_continuous("p at best lag") + 
    scale_x_date(date_breaks="1 month", date_labels="%b")

miLags %>%
    filter(!is.na(p)) %>%
    ggplot(aes(x=miDates[startWindow], y=lags)) + 
    geom_tile(aes(fill=p)) + 
    labs(x="Window start date", y="Lag (days)", title="Lag (days) by window starting date") + 
    scale_color_continuous("p at lag") + 
    scale_x_date(date_breaks="1 month", date_labels="%b")

During the time when the spike in Michigan was growing, average “best lag” was in the 10-15 day range. A similar finding is observed in the area of the downslope of the spike in Michigan. The pattern that lag tends to climb linearly may be a feature of Michigan or of the methodology.

The process is converted to functional form:

# Function to assess correlations by lag and window by state
stateCorr <- function(lst, 
                      keyState, 
                      met="vpm7", 
                      v1Name="CTP_deaths", 
                      v2Name="CTP_cases", 
                      windowSize=42
                      ) {
    
    # FUNCTION ARGUMENTS:
    # lst: the processed list
    # keyState: the state of interest
    # met: the metric of interest
    # v1Name: the name of the first vector
    # v2Name: the name of the second vector
    # windowSize: number of days in the window
    
    # Extract the data for the key State
    df <- lst[["dfList"]] %>%
        filter(state %in% keyState, !is.na(get(met))) %>%
        arrange(state, date, name)
    
    # Get the minimum date that is common to both
    minDate <- df %>%
        group_by(name) %>%
        summarize(date=min(date)) %>%
        pull(date) %>%
        max()
    
    # Extract v1 and v2
    v1 <- df %>% 
        filter(name==v1Name, date>=minDate) %>% 
        pull(met)
    v2 <- df %>% 
        filter(name==v2Name, date>=minDate) %>% 
        pull(met)

    # Confirm that dates are the same for both vectors
    dfDates1 <- df %>% filter(name==v1Name, date>=minDate) %>% pull(date)
    dfDates2 <- df %>% filter(name==v2Name, date>=minDate) %>% pull(date)
    str(dfDates1) %>% print()
    str(dfDates2) %>% print()
    if (!all.equal(dfDates1, dfDates2)) stop("\nDate mismatch\n")

    # Find the lags in the data
    dfLags <- lagVectorWindows(v1, v2, lagsTry=0:30, windowSize=windowSize) %>%
        mutate(windowStartDate=dfDates1[startWindow])

    # Boxplot of correlations by lag
    p1 <- dfLags %>%
        filter(!is.na(p)) %>%
        ggplot(aes(x=factor(lags), y=p)) + 
        geom_boxplot(fill="lightblue") + 
        labs(x="Lag (days)", y="Correlation", title="Box plot of correlation by lag (days)")
    print(p1)

    # Plot of best lags by starting date
    p2 <- dfLags %>%
        filter(!is.na(p)) %>%
        group_by(startWindow) %>%
        filter(p==max(p)) %>%
        ggplot(aes(x=windowStartDate, y=lags)) + 
        geom_point(aes(size=p)) + 
        labs(x="Window start date", y="Best lag (days)", title="Best lag by window starting date") + 
        scale_size_continuous("p at best lag") + 
        scale_x_date(date_breaks="1 month", date_labels="%b")
    print(p2)
    
    # Plot of correlations by lag
    p3 <- dfLags %>%
        filter(!is.na(p)) %>%
        ggplot(aes(x=windowStartDate, y=lags)) + 
        geom_tile(aes(fill=p)) + 
        labs(x="Window start date", y="Lag (days)", title="Lag (days) by window starting date") + 
        scale_color_continuous("p at lag") + 
        scale_x_date(date_breaks="1 month", date_labels="%b")
    print(p3)
    
    # Return dfLags
    dfLags
    
}


miLagData <- stateCorr(mwCurveList, keyState="MI")
## `summarise()` ungrouping output (override with `.groups` argument)
##  Date[1:232], format: "2020-03-04" "2020-03-05" "2020-03-06" "2020-03-07" "2020-03-08" ...
## NULL
##  Date[1:232], format: "2020-03-04" "2020-03-05" "2020-03-06" "2020-03-07" "2020-03-08" ...
## NULL

ilLagData <- stateCorr(mwCurveList, keyState="IL")
## `summarise()` ungrouping output (override with `.groups` argument)
##  Date[1:229], format: "2020-03-07" "2020-03-08" "2020-03-09" "2020-03-10" "2020-03-11" ...
## NULL
##  Date[1:229], format: "2020-03-07" "2020-03-08" "2020-03-09" "2020-03-10" "2020-03-11" ...
## NULL

nynjLagData <- stateCorr(neCurveList, keyState="NY/NJ")
## `summarise()` ungrouping output (override with `.groups` argument)
##  Date[1:249], format: "2020-02-13" "2020-02-14" "2020-02-15" "2020-02-16" "2020-02-17" ...
## NULL
##  Date[1:249], format: "2020-02-13" "2020-02-14" "2020-02-15" "2020-02-16" "2020-02-17" ...
## NULL

The data suggest that there is a fairly discernible “best lag” around the time that there are bona fide spikes in both cases and deaths. A challenge is that there does not appear to be a discernible “best lag” at other times when spikes may be driven by factors like increased testing and/or there is more random noise in the data then clear patterns with a strong correlation.

A next step is to look at the reverse of this process and use lead(deaths) rather than lag(cases). This will allow for looking at time periods where cases begin to change rapidly and seeing whether these drive rapid changes in deaths at any particular lead time.

The function is updated to allow for either lags or leads:

lagVectorWindows <- function(v1, 
                             v2, 
                             lagsTry=0:30, 
                             windowSize=30, 
                             minNoNA=ceiling(windowSize/2), 
                             updateStatus=FALSE, 
                             isLag=TRUE
                             ) {
    
    # FUNCTION ARGUMENTS:
    # v1: the first vector, which will be used 'as is'
    # v2: the second vector, which will be lagged/led by various values for lagsTry
    # lagsTry: the values for x that will be used in cor(v1, lag/lead(v2, x))
    # windowSize: the size of the window to use in taking snippets of v1 and lagged/led v2
    # minNoNA: minimum number of non-NA lagged/led values needed to calculate a correlation
    # updateStates: whether to print which window is being worked on
    # isLag: boolean, should a lag or a lead be applied (TRUE is lag, FALSE is lead)
    
    # Find the function to be used
    func <- if (isLag) lag else lead
    
    # Find the list of all possible window start points
    windowStarts <- 1:(length(v1)-windowSize+1)
    
    # Helper function to create a frame of correlations
    helperCorr <- function(s) {
        
        # Create the end point for the vector
        e <- s + windowSize - 1
        
        # Announce the start
        if (updateStatus) cat("\nProcessing window starting at:", s)
        
        # Create the correlations tibble for all values of lag, and return
        tibble::tibble(startWindow=s, endWindow=e, lags=lagsTry) %>%
            mutate(na1=sum(is.na(v1[s:e])), 
                   na2=sapply(lags, FUN=function(x) sum(is.na(func(v2, x)[s:e]))), 
                   p=sapply(lags, 
                            FUN=function(x) { 
                                ifelse(sum(!is.na(func(v2, x)[s:e])) < minNoNA,
                                       NA, 
                                       cor(v1[s:e], func(v2, x)[s:e], use="complete.obs")
                                       ) 
                                } 
                            )
                   )
    }

    # Bind the correlations frames and return
    map_dfr(windowStarts, .f=helperCorr)
    
}



# Function to assess correlations by lag/lead and window by state
stateCorr <- function(lst, 
                      keyState, 
                      met="vpm7", 
                      v1Name="CTP_deaths", 
                      v2Name="CTP_cases", 
                      windowSize=42, 
                      isLag=TRUE
                      ) {
    
    # FUNCTION ARGUMENTS:
    # lst: the processed list
    # keyState: the state of interest
    # met: the metric of interest
    # v1Name: the name of the first vector (this is considered fixed)
    # v2Name: the name of the second vector (this will have the lead/lag applied to it)
    # windowSize: number of days in the window
    # isLag: boolean, whether to use lag (TRUE) or lead (FALSE) on v2Name
    
    # Extract the data for the key State
    df <- lst[["dfList"]] %>%
        filter(state %in% keyState, !is.na(get(met))) %>%
        arrange(state, date, name)
    
    # Get the minimum date that is common to both
    minDate <- df %>%
        group_by(name) %>%
        summarize(date=min(date)) %>%
        pull(date) %>%
        max()
    
    # Extract v1 and v2
    v1 <- df %>% 
        filter(name==v1Name, date>=minDate) %>% 
        pull(met)
    v2 <- df %>% 
        filter(name==v2Name, date>=minDate) %>% 
        pull(met)

    # Confirm that dates are the same for both vectors
    dfDates1 <- df %>% filter(name==v1Name, date>=minDate) %>% pull(date)
    dfDates2 <- df %>% filter(name==v2Name, date>=minDate) %>% pull(date)
    if (!all.equal(dfDates1, dfDates2)) stop("\nDate mismatch\n")

    # Find the lags in the data
    dfLags <- lagVectorWindows(v1, v2, lagsTry=0:30, windowSize=windowSize, isLag=isLag) %>%
        mutate(windowStartDate=dfDates1[startWindow])

    # Give the description of the lag or lead
    descr <- ifelse(isLag, "lag (days)", "lead (days)")
    
    # Boxplot of correlations by lag
    p1 <- dfLags %>%
        filter(!is.na(p)) %>%
        ggplot(aes(x=factor(lags), y=p)) + 
        geom_boxplot(fill="lightblue") + 
        labs(x=stringr::str_to_title(descr), 
             y="Correlation", 
             title=paste0("Box plot of correlation by ", descr)
             )
    print(p1)

    # Plot of best lags by starting date
    p2 <- dfLags %>%
        filter(!is.na(p)) %>%
        group_by(startWindow) %>%
        filter(p==max(p)) %>%
        ggplot(aes(x=windowStartDate, y=lags)) + 
        geom_point(aes(size=p)) + 
        labs(x="Window start date", 
             y=paste0("Best ", descr), 
             title=paste0("Best ", descr, " by window starting date")
             ) + 
        scale_size_continuous(paste0("p at best ", stringr::str_replace(descr, " .*", ""))) + 
        scale_x_date(date_breaks="1 month", date_labels="%b")
    print(p2)
    
    # Plot of correlations by lag
    p3 <- dfLags %>%
        filter(!is.na(p)) %>%
        ggplot(aes(x=windowStartDate, y=lags)) + 
        geom_tile(aes(fill=p)) + 
        labs(x="Window start date", 
             y=stringr::str_to_title(descr), 
             title=paste0(stringr::str_to_title(descr), " by window starting date")
             ) + 
        scale_color_continuous(paste0("p at ", stringr::str_replace(descr, " .*", ""))) + 
        scale_x_date(date_breaks="1 month", date_labels="%b")
    print(p3)

    # Rename variable lags to leads if isLag is FALSE
    if (isFALSE(isLag)) dfLags <- dfLags %>%
        rename(leads=lags)
    
    # Return dfLags
    dfLags
    
}


miLeadData <- stateCorr(mwCurveList, keyState="MI", v1Name="CTP_cases", v2Name="CTP_deaths", isLag=FALSE)
## `summarise()` ungrouping output (override with `.groups` argument)

nynjLeadData <- stateCorr(neCurveList, keyState="NY/NJ", v1Name="CTP_cases", v2Name="CTP_deaths", isLag=FALSE)
## `summarise()` ungrouping output (override with `.groups` argument)

The leads continue to be more reliable in NY/NJ than in MI. The southern states with later disease outbreak can also be explored:

# Example for southern states
soCurveList <- createAndAlignCurves(stateData, 
                                    yVal="vpm7", 
                                    namesPlot=c("CTP_cases"),
                                    namesSec=c("CTP_deaths"), 
                                    keyStates=c("AZ", "FL", "GA", "TX"),
                                    plotTitle="2020 coronavirus burden per million per day (select states)", 
                                    plotSub="Cases on main y-axis, deaths on secondary y-axis", 
                                    primYLab="Cases per million (7-day rolling mean)",
                                    secYLab="Deaths per million (7-day rolling mean)",
                                    facetFixed=TRUE, 
                                    popData=usafPrepped$usafDemo, 
                                    printPlot=TRUE,
                                    lagValueMetric="vpm7", 
                                    lagDepName="CTP_deaths", 
                                    lagsTry=0:30
                                    )
## 
## Will scale by: 0.02075921

## Warning: Removed 3 row(s) containing missing values (geom_path).

assessStateCFR(soCurveList, 
               keyStates=c("AZ", "FL", "GA", "TX"), 
               depVar="CTP_deaths", 
               indepVar="CTP_cases", 
               depTitleName="deaths", 
               indepTitleName="cases"
               )
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 3
##   state earlyLag lateLag
##   <chr>    <dbl>   <dbl>
## 1 GA          10      20
## 2 TX          10      20
## 3 FL          10      20
## 4 AZ          10      20

## # A tibble: 876 x 13
##    state date       CTP_cases CTP_deaths earlyLag lateLag   eLag  lLag pctEarly
##    <chr> <date>         <dbl>      <dbl>    <dbl>   <dbl>  <dbl> <dbl>    <dbl>
##  1 AZ    2020-03-17      1.17     0            10      20 0.0837    NA        1
##  2 AZ    2020-03-18      1.92     0.0209       10      20 0.146     NA        1
##  3 AZ    2020-03-19      2.93     0.0418       10      20 0.146     NA        1
##  4 AZ    2020-03-20      5.17     0.0418       10      20 0.126     NA        1
##  5 AZ    2020-03-21      7.05     0.105        10      20 0.146     NA        1
##  6 AZ    2020-03-22      8.83     0.126        10      20 0.146     NA        1
##  7 AZ    2020-03-23     11.2      0.167        10      20 0.272     NA        1
##  8 AZ    2020-03-24     14.0      0.272        10      20 0.293     NA        1
##  9 AZ    2020-03-25     16.1      0.293        10      20 0.398     NA        1
## 10 AZ    2020-03-26     16.0      0.314        10      20 0.732     NA        1
## # ... with 866 more rows, and 4 more variables: x <dbl>, y <dbl>, mon <fct>,
## #   pred <dbl>
txLeadData <- stateCorr(soCurveList, keyState="TX", v1Name="CTP_cases", v2Name="CTP_deaths", isLag=FALSE)
## `summarise()` ungrouping output (override with `.groups` argument)

The lags and leads make the most sense around the areas where there is a significant spike in the data. The are fairly noisy otherwise. Does using a rolling average help with assessing leads?

txLeadRoll21 <- txLeadData %>%
    arrange(startWindow, leads) %>%
    group_by(leads) %>%
    mutate(leadroll7=zoo::rollmean(p, k=7, fill=NA), leadroll21=zoo::rollmean(p, k=21, fill=NA)) %>%
    ungroup() 

txLeadRoll21 %>%
    ggplot(aes(x=windowStartDate, y=leads)) + 
    geom_tile(aes(fill=leadroll21))

txLeadRoll21 %>%
    filter(!is.na(leadroll21)) %>%
    group_by(windowStartDate) %>%
    filter(leadroll21==max(leadroll21)) %>%
    ggplot(aes(x=windowStartDate, y=leads)) + 
    geom_point(aes(size=leadroll21))

In the main June-July time period when cases were climbing rapidly, the best lead appears to be in the 15-20 day range. The best lead at other times is not as clear, with noise in the data potentially drowning out the signal.

Data from late 2020 are integrated in to an overall state data file:

# Create the CTP and USAF files
ctpPrepped_210106 <- prepCTPData(readFromRDS("old_hier5_210101"))
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
usafPrepped_210106 <- prepUSAFData(readFromRDS("cty_old_20210104")$clusterStateData)
## `summarise()` regrouping output by 'state' (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
# Create an integrated state demographics file
demoData_210106 <- ctpPrepped_210106$ctpDemo %>%
    rename(popCTP=pop) %>%
    full_join(rename(usafPrepped_210106$usafDemo, popUSAF=pop), by="state") %>%
    mutate(pop=pmax(popCTP, popUSAF))

# Create the CDC file
cdcPrepped_210106 <- prepCDCData(readFromRDS("cdcList_20210105"), popData=demoData_210106)

# Create the integrated state data
stateData_210106 <- ctpPrepped_210106$ctpData %>%
    bind_rows(usafPrepped_210106$usafData) %>%
    bind_rows(cdcPrepped_210106$cdcData)
glimpse(stateData_210106)
## Rows: 126,003
## Columns: 9
## $ state  <chr> "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", "AK", ...
## $ date   <date> 2020-03-06, 2020-03-07, 2020-03-08, 2020-03-09, 2020-03-10,...
## $ metric <chr> "cases", "cases", "cases", "cases", "cases", "cases", "cases...
## $ source <chr> "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP", "CTP...
## $ name   <chr> "CTP_cases", "CTP_cases", "CTP_cases", "CTP_cases", "CTP_cas...
## $ value  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 5, 3, 3, 1, 10, 13, 4, 7...
## $ value7 <dbl> NA, NA, NA, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.00...
## $ vpm    <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, ...
## $ vpm7   <dbl> NA, NA, NA, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.00...
# Control totals
stateData_210106 %>%
    group_by(name) %>%
    summarize(value=sum(value, na.rm=TRUE), value7=sum(value7, na.rm=TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 8 x 3
##   name             value     value7
##   <chr>            <dbl>      <dbl>
## 1 CDC_deaths    2626487.   2575402.
## 2 CDC_excess     291802.    289563.
## 3 CTP_cases    19629177   19028416.
## 4 CTP_deaths     335129     326469.
## 5 CTP_hosp     14567264   14092648.
## 6 CTP_tests   249943721  245268720.
## 7 USAF_cases   20000027   19318617.
## 8 USAF_deaths    341896     333434.

The process is run for the selected states in the northeast:

# Example of combining states
ne_casedeath_210106 <- plotStateMetric(stateData_210106, 
                                       yVal="vpm7", 
                                       namesPlot=c("CTP_cases"),
                                       namesSec=c("CTP_deaths"), 
                                       keyStates=c("NY", "NJ", "MA", "CT", "RI", "NH", "VT", "ME"),
                                       combStates=c("MA"="S NE", "CT"="S NE", "RI"="S NE", 
                                                    "NH"="N NE", "VT"="N NE", "ME"="N NE"
                                                    ),
                                       plotTitle="2020 coronavirus burden per million per day (select states)",
                                       plotSub="Cases on main y-axis, deaths on secondary y-axis", 
                                       primYLab="Cases per million (7-day rolling mean)",
                                       secYLab="Deaths per million (7-day rolling mean)",
                                       facetFixed=TRUE, 
                                       popData=usafPrepped_210106$usafDemo,
                                       returnData=TRUE
                                       )
## `summarise()` regrouping output by 'state', 'date' (override with `.groups` argument)
## 
## Will scale by: 0.04857715

ne_casedeath_210106
## # A tibble: 2,560 x 5
##    state date       name        vpm7     pop
##    <chr> <date>     <chr>      <dbl>   <dbl>
##  1 N NE  2020-03-03 CTP_cases     NA  626042
##  2 N NE  2020-03-03 CTP_deaths    NA  626042
##  3 N NE  2020-03-04 CTP_cases     NA 1956650
##  4 N NE  2020-03-04 CTP_deaths    NA 1956650
##  5 N NE  2020-03-05 CTP_cases     NA 1956650
##  6 N NE  2020-03-05 CTP_deaths    NA 1956650
##  7 N NE  2020-03-06 CTP_cases     NA 1956650
##  8 N NE  2020-03-06 CTP_deaths    NA 1956650
##  9 N NE  2020-03-07 CTP_cases     NA 3285978
## 10 N NE  2020-03-07 CTP_deaths    NA 3285978
## # ... with 2,550 more rows

The createAndAlignCurves() function is run for select northeast states:

# Example for northeastern states
neCurveList_210106 <- createAndAlignCurves(stateData_210106, 
                                           yVal="vpm7", 
                                           namesPlot=c("CTP_cases"),
                                           namesSec=c("CTP_deaths"), 
                                           keyStates=c("NY", "NJ", "MA", "CT", "RI", 
                                                       "NH", "VT", "ME", "DE", "DC"
                                                       ),
                                           combStates=c("MA"="S NE", "CT"="S NE", "RI"="S NE", 
                                                        "NH"="N NE", "VT"="N NE", "ME"="N NE", 
                                                        "NY"="NY/NJ", "NJ"="NY/NJ", 
                                                        "DE"="DE/DC", "DC"="DE/DC"
                                                        ),
                                           plotTitle="2020 coronavirus burden per million per day (select states)", 
                                           plotSub="Cases on main y-axis, deaths on secondary y-axis", 
                                           primYLab="Cases per million (7-day rolling mean)",
                                           secYLab="Deaths per million (7-day rolling mean)",
                                           facetFixed=TRUE, 
                                           popData=usafPrepped_210106$usafDemo, 
                                           printPlot=TRUE,
                                           lagValueMetric="vpm7", 
                                           lagDepName="CTP_deaths", 
                                           lagsTry=0:30
                                           )
## `summarise()` regrouping output by 'state', 'date' (override with `.groups` argument)
## 
## Will scale by: 0.04640771

## Warning: Removed 3 row(s) containing missing values (geom_path).

There is a clear difference in deaths per case that prevents full alignment of the curves using a single ratio and lag. The death rate as a function of confirmed cases is significantly higher in April and May than in late 2020.

The process is also run for select midwestern states:

# Example for midwestern states
mwCurveList_210106 <- createAndAlignCurves(stateData_210106, 
                                           yVal="vpm7", 
                                           namesPlot=c("CTP_cases"),
                                           namesSec=c("CTP_deaths"), 
                                           keyStates=c("OH", "MI", "IN", "IL"),
                                           plotTitle="2020 coronavirus burden per million per day (select states)", 
                                           plotSub="Cases on main y-axis, deaths on secondary y-axis", 
                                           primYLab="Cases per million (7-day rolling mean)",
                                           secYLab="Deaths per million (7-day rolling mean)",
                                           facetFixed=TRUE, 
                                           popData=usafPrepped_210106$usafDemo, 
                                           printPlot=TRUE,
                                           lagValueMetric="vpm7", 
                                           lagDepName="CTP_deaths", 
                                           lagsTry=0:30
                                           )
## 
## Will scale by: 0.01467959

## Warning: Removed 3 row(s) containing missing values (geom_path).

Differences in death rate over time are similarly observed in the midwest.

Additional states from the Upper Midwest / Great Plains are investigated:

# Example for select upper Midwest states
gpCurveList_210106 <- createAndAlignCurves(stateData_210106, 
                                           yVal="vpm7", 
                                           namesPlot=c("CTP_cases"),
                                           namesSec=c("CTP_deaths"), 
                                           keyStates=c("WI", "MN", "ND", "SD", "IA"),
                                           combStates=c("ND"="Dakotas", "SD"="Dakotas"),
                                           plotTitle="2020 coronavirus burden per million per day (select states)", 
                                           plotSub="Cases on main y-axis, deaths on secondary y-axis", 
                                           primYLab="Cases per million (7-day rolling mean)",
                                           secYLab="Deaths per million (7-day rolling mean)",
                                           facetFixed=TRUE, 
                                           popData=usafPrepped_210106$usafDemo, 
                                           printPlot=TRUE,
                                           lagValueMetric="vpm7", 
                                           lagDepName="CTP_deaths", 
                                           lagsTry=0:30
                                           )
## `summarise()` regrouping output by 'state', 'date' (override with `.groups` argument)
## 
## Will scale by: 0.01532896

## Warning: Removed 3 row(s) containing missing values (geom_path).

With these states having more pronounced late outbreaks with lesser early outbreaks the single multiplier is more effective at aligning the curves.

The process is run for select southern states:

# Example for southern states
soCurveList_210106 <- createAndAlignCurves(stateData_210106, 
                                           yVal="vpm7", 
                                           namesPlot=c("CTP_cases"),
                                           namesSec=c("CTP_deaths"), 
                                           keyStates=c("AZ", "FL", "GA", "TX"),
                                           plotTitle="2020 coronavirus burden per million per day (select states)", 
                                           plotSub="Cases on main y-axis, deaths on secondary y-axis", 
                                           primYLab="Cases per million (7-day rolling mean)",
                                           secYLab="Deaths per million (7-day rolling mean)",
                                           facetFixed=TRUE, 
                                           popData=usafPrepped_210106$usafDemo, 
                                           printPlot=TRUE,
                                           lagValueMetric="vpm7", 
                                           lagDepName="CTP_deaths", 
                                           lagsTry=0:30
                                           )
## 
## Will scale by: 0.01292161

## Warning: Removed 3 row(s) containing missing values (geom_path).

The curves align reasonably well with a single lag and death rate for these states.

Evolution of state CFR is run for the northeast states:

# Deaths vs. cases in northeast
assessStateCFR(neCurveList_210106, 
               keyStates=c("NY/NJ", "S NE", "DE/DC", "N NE"), 
               depVar="CTP_deaths", 
               indepVar="CTP_cases", 
               depTitleName="deaths", 
               indepTitleName="cases", 
               lagEarlyValue=NULL, 
               lagLateValue=NULL, 
               lagLateDate=as.Date("2020-10-31")
               )
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 3
##   state earlyLag lateLag
##   <chr>    <int>   <int>
## 1 DE/DC        2      12
## 2 N NE         3      25
## 3 NY/NJ        6      16
## 4 S NE         7      29

## Warning: Removed 4 rows containing missing values (position_stack).
## Warning: Removed 4 rows containing missing values (geom_text).

## # A tibble: 1,221 x 14
##    state date          pop CTP_cases CTP_deaths earlyLag lateLag  eLag  lLag
##    <chr> <date>      <dbl>     <dbl>      <dbl>    <int>   <int> <dbl> <dbl>
##  1 DE/DC 2020-03-11 1.62e6      1.41     0             2      12  1.24    NA
##  2 DE/DC 2020-03-12 1.62e6      1.85     0             2      12  1.24    NA
##  3 DE/DC 2020-03-13 1.62e6      2.12     0             2      12  1.41    NA
##  4 DE/DC 2020-03-14 1.62e6      2.91     0             2      12  1.85    NA
##  5 DE/DC 2020-03-15 1.62e6      4.41     0             2      12  2.12    NA
##  6 DE/DC 2020-03-16 1.62e6      4.86     0             2      12  2.91    NA
##  7 DE/DC 2020-03-17 1.62e6      8.39     0.0883        2      12  4.41    NA
##  8 DE/DC 2020-03-18 1.62e6      9.36     0.0883        2      12  4.86    NA
##  9 DE/DC 2020-03-19 1.62e6     11.7      0.0883        2      12  8.39    NA
## 10 DE/DC 2020-03-20 1.62e6     14.0      0.177         2      12  9.36    NA
## # ... with 1,211 more rows, and 5 more variables: pctEarly <dbl>, x <dbl>,
## #   y <dbl>, mon <fct>, pred <dbl>

Among northeastern states, CFR has generally declined since the early pandemic while lag between case spikes and death spikes has increased. This is consistent with increased detection of milder disease and/or earlier detection of disease (e.g., in community rather than in ER) and/or improved treating disease treatments.